#######################################################
####                                               ####
####  Input: MP-by-debate data for analysis        ####
####  Output: Various validation checks            ####
####                                               ####
#######################################################

# Load libraries

library(quanteda) # v3.3.1
library(data.table) # v1.14.8
library(plyr) # v1.8.9
library(ggplot2) # v3.4.3
library(corrplot) # v0.92
library(tidyverse) # v2.0.0
library(broom) # v1.0.5
library(broom.helpers) # v1.14.0
library(estimatr) # v1.0.0

load("working/speech_scores.Rdata")

# Frontbench / backbench ministers and issue areas ------------------------------------------------------------

woman_gov <- lm_robust(women_dic_std ~ women_minister_gov + women_minister_opp + women_committee, data = speech_scores,
                       clusters = person_id, se_type = "stata")
health_gov <- lm_robust(health_std ~ health_minister_gov + health_minister_opp + health_committee, data = speech_scores,
                 clusters = person_id, se_type = "stata")
education_gov <- lm_robust(education_std ~ education_minister_gov + education_minister_opp + education_committee, data = speech_scores,
                    clusters = person_id, se_type = "stata")
children_gov <- lm_robust(children_std ~ family_committee, data = speech_scores,
                   clusters = person_id, se_type = "stata")
welfare_gov <- lm_robust(social_std ~ welfare_minister_gov + welfare_minister_opp + welfare_committee, data = speech_scores,
                  clusters = person_id, se_type = "stata")
environment_gov <- lm_robust(environment_std ~ environment_minister_gov + environment_minister_opp + environment_committee, data = speech_scores,
                             clusters = person_id, se_type = "stata")
defence_gov <- lm_robust(defence_std ~ defence_minister_gov + defence_minister_opp + defence_committee, data = speech_scores,
                         clusters = person_id, se_type = "stata")
economy_gov <- lm_robust(economy_std ~ economy_minister_gov + economy_minister_opp + economy_committee, data = speech_scores,
                         clusters = person_id, se_type = "stata")
agriculture_gov <- lm_robust(agriculture_std ~ agriculture_minister_gov + agriculture_minister_opp + agriculture_committee, data = speech_scores,
                             clusters = person_id, se_type = "stata")
trade_gov <- lm_robust(trade_std ~ trade_minister_gov + trade_minister_opp + trade_committee, data = speech_scores,
                       clusters = person_id, se_type = "stata")
crime_gov <- lm_robust(crime_std ~ crime_minister_gov + crime_minister_opp + crime_committee, data = speech_scores,
                       clusters = person_id, se_type = "stata")
transport_gov <- lm_robust(transport_std ~ transport_minister_gov + transport_minister_opp + transport_committee, data = speech_scores,
                           clusters = person_id, se_type = "stata")

covariates_for_position_plot <- bind_rows(
  tidy(woman_gov, conf.int = T)[2:4,]%>%mutate(outcome="Women"),
  tidy(health_gov, conf.int = T)[2:4,]%>%mutate(outcome="Health"),
  tidy(education_gov, conf.int = T)[2:4,]%>%mutate(outcome="Education"),
  tidy(welfare_gov, conf.int = T)[2:4,]%>%mutate(outcome="Welfare"),
  tidy(children_gov, conf.int = T)[2,]%>%mutate(outcome="Children & Family"),
  tidy(environment_gov, conf.int = T)[2:4,]%>%mutate(outcome="Environment"),
  tidy(defence_gov, conf.int = T)[2:4,]%>%mutate(outcome="Defence"),
  tidy(economy_gov, conf.int = T)[2:4,]%>%mutate(outcome="Finance & Economy"),
  tidy(agriculture_gov, conf.int = T)[2:4,]%>%mutate(outcome="Agriculture"),
  tidy(trade_gov, conf.int = T)[2:4,]%>%mutate(outcome="Trade"),
  tidy(crime_gov, conf.int = T)[2:4,]%>%mutate(outcome="Crime & Policing"),
  tidy(transport_gov, conf.int = T)[2:4,]%>%mutate(outcome="Transport")
)

pdf("analysis/plots/figure_S1.pdf", 11.5,6)
ggplot(covariates_for_position_plot,aes(y=term,x=estimate))+
  geom_point()+
  ylab("") + xlab("") +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_pointrange(aes(xmin=conf.low,xmax=conf.high))+
  theme_bw()+
  facet_wrap(~outcome,scales="free") +
  scale_y_discrete(labels=c("Committee \n Member", "Government \n Minister", "Shadow \n Minister")) +
  theme(axis.text = element_text(size = 8),
        axis.title = element_text(size = 9),
        plot.title =  element_text(size = 10, hjust = 0.5))
dev.off()


# Topics within debates

health_debate <- lm_robust(health_std ~ health_debate, data = speech_scores, clusters = person_id, se_type = "stata")
women_debate <- lm_robust(women_dic_std ~ women_debate, data = speech_scores, clusters = person_id, se_type = "stata")
education_debate <- lm_robust(education_std ~ education_debate, data = speech_scores, clusters = person_id, se_type = "stata")
welfare_debate <- lm_robust(social_std ~ welfare_debate, data = speech_scores, clusters = person_id, se_type = "stata")
environment_debate <- lm_robust(environment_std ~ environment_debate, data = speech_scores, clusters = person_id, se_type = "stata")
children_debate <- lm_robust(children_std ~ children_debate, data = speech_scores, clusters = person_id, se_type = "stata")
defence_debate <- lm_robust(defence_std ~ defence_debate, data = speech_scores, clusters = person_id, se_type = "stata")
economy_debate <- lm_robust(economy_std ~ economy_debate, data = speech_scores, clusters = person_id, se_type = "stata")
agriculture_debate <- lm_robust(agriculture_std ~ agriculture_debate, data = speech_scores, clusters = person_id, se_type = "stata")
trade_debate <- lm_robust(trade_std ~ trade_debate, data = speech_scores, clusters = person_id, se_type = "stata")
crime_debate <- lm_robust(crime_std ~ crime_debate, data = speech_scores, clusters = person_id, se_type = "stata")
transport_debate <- lm_robust(transport_std ~ transport_debate, data = speech_scores, clusters = person_id, se_type = "stata")

covariates_for_debate_plot <- bind_rows(
  tidy(women_debate, conf.int = T)[2,]%>%mutate(outcome="Women"),
  tidy(health_debate, conf.int = T)[2,]%>%mutate(outcome="Health"),
  tidy(education_debate, conf.int = T)[2,]%>%mutate(outcome="Education"),
  tidy(welfare_debate, conf.int = T)[2,]%>%mutate(outcome="Welfare"),
  tidy(children_debate, conf.int = T)[2,]%>%mutate(outcome="Children & Family"),
  tidy(environment_debate, conf.int = T)[2,]%>%mutate(outcome="Environment"),
  tidy(defence_debate, conf.int = T)[2,]%>%mutate(outcome="Defence"),
  tidy(economy_debate, conf.int = T)[2,]%>%mutate(outcome="Finance & Economy"),
  tidy(agriculture_debate, conf.int = T)[2,]%>%mutate(outcome="Agriculture"),
  tidy(trade_debate, conf.int = T)[2,]%>%mutate(outcome="Trade"),
  tidy(crime_debate, conf.int = T)[2,]%>%mutate(outcome="Crime & Policing"),
  tidy(transport_debate, conf.int = T)[2,]%>%mutate(outcome="Transport")
)

pdf("analysis/plots/figure_S2.pdf",11.5,6)
ggplot(covariates_for_debate_plot,aes(y=term, x=estimate))+
  geom_point() +
  geom_pointrange(aes(xmin=conf.low,xmax=conf.high)) +
  geom_vline(xintercept = 0, linetype = 2) +
  ylab("") + xlab("") +
  theme_bw() +
  theme(legend.position = "none") +
  facet_wrap(~outcome,scales="free") +
  scale_y_discrete(labels=c("Issue \n Debate")) +
  theme(axis.text = element_text(size = 8),
        axis.title = element_text(size = 9),
        plot.title =  element_text(size = 10, hjust = 0.5))
dev.off()

# Debate titles for debates with speeches where issues are raised most

load("working/dictionaries_sentence_issues.Rdata")
load("data/debates.Rdata")

## Drop all debates with fewer than 5 participants
ep_id_to_person_debate_ids <- debates[,list(person_id = unique(person_id),
                                            section_id = unique(section_id),
                                            parent = unique(parent)),by = c("person_id", "section_id")]

## Drop question time
ep_id_to_person_debate_ids$question_time <- grepl("Oral answers to question",ep_id_to_person_debate_ids$parent, ignore.case = T)
ep_id_to_person_debate_ids <- ep_id_to_person_debate_ids[ep_id_to_person_debate_ids$question_time==FALSE,]

debates_for_validation <- ep_id_to_person_debate_ids[,list(parent = unique(parent)), by = section_id]

debate_scores <- dictionary_scores_issues[, list(glove_defence = weighted.mean(glove_defence/n_words, n_words),
                                                 glove_economy = weighted.mean(glove_economy/n_words, n_words),
                                                 glove_agriculture = weighted.mean(glove_agriculture/n_words, n_words),
                                                 glove_health = weighted.mean(glove_health/n_words, n_words),
                                                 glove_children = weighted.mean(glove_children/n_words, n_words),
                                                 glove_education = weighted.mean(glove_education/n_words, n_words),
                                                 glove_social = weighted.mean(glove_social/n_words, n_words),
                                                 glove_trade = weighted.mean(glove_trade/n_words, n_words),
                                                 glove_environment = weighted.mean(glove_environment/n_words, n_words),
                                                 glove_energy = weighted.mean(glove_energy/n_words, n_words),
                                                 glove_crime = weighted.mean(glove_crime/n_words, n_words),
                                                 glove_transport = weighted.mean(glove_transport/n_words, n_words)),
                                          by = section_id]

debate_validation <- merge(debates_for_validation, debate_scores, by = c("section_id"))

defence_debates <- debate_validation[order(glove_defence, decreasing = TRUE)[1:20], c("parent")]
economy_debates <- debate_validation[order(glove_economy, decreasing = TRUE)[1:20], c("parent")]
agriculture_debates <- debate_validation[order(glove_agriculture, decreasing = TRUE)[1:20], c("parent")]
health_debates <- debate_validation[order(glove_health, decreasing = TRUE)[1:20], c("parent")]
children_debates <- debate_validation[order(glove_children, decreasing = TRUE)[1:20], c("parent")]
education_debates <- debate_validation[order(glove_education, decreasing = TRUE)[1:20], c("parent")]
welfare_debates <- debate_validation[order(glove_social, decreasing = TRUE)[1:20], c("parent")]
trade_debates <- debate_validation[order(glove_trade, decreasing = TRUE)[1:20], c("parent")]
environment_debates <- debate_validation[order(glove_environment, decreasing = TRUE)[1:20], c("parent")]
crime_debates <- debate_validation[order(glove_crime, decreasing = TRUE)[1:20], c("parent")]
transport_debates <- debate_validation[order(glove_transport, decreasing = TRUE)[1:20], c("parent")]
energy_debates <- debate_validation[order(glove_energy, decreasing = TRUE)[1:20], c("parent")]

save(defence_debates, economy_debates, agriculture_debates, health_debates, children_debates,
     education_debates, welfare_debates, trade_debates, environment_debates,
     crime_debates, transport_debates, energy_debates,
     file = "working/validation/top_debates.Rdata")

# Top sentences and topic validation ----------------------------------------------------------------

dictionary_scores <- dictionary_scores_issues

## Cut to only 100 words or less
dictionary_scores <- dictionary_scores[dictionary_scores$n_words<=25,]

top_defence <- dictionary_scores[order(dictionary_scores$glove_defence, decreasing = TRUE)[1:150], c("sent", "glove_defence")]
top_economy <- dictionary_scores[order(dictionary_scores$glove_economy, decreasing = TRUE)[1:150], c("sent", "glove_economy")]
top_agriculture <- dictionary_scores[order(dictionary_scores$glove_agriculture, decreasing = TRUE)[1:150], c("sent", "glove_agriculture")]
top_health <- dictionary_scores[order(dictionary_scores$glove_health, decreasing = TRUE)[1:150], c("sent", "glove_health")]
top_children <- dictionary_scores[order(dictionary_scores$glove_children, decreasing = TRUE)[1:150], c("sent", "glove_children")]
top_education <- dictionary_scores[order(dictionary_scores$glove_education, decreasing = TRUE)[1:150], c("sent", "glove_education")]
top_social <- dictionary_scores[order(dictionary_scores$glove_social, decreasing = TRUE)[1:150], c("sent", "glove_social")]
top_trade <- dictionary_scores[order(dictionary_scores$glove_trade, decreasing = TRUE)[1:150], c("sent", "glove_trade")]
top_environment <- dictionary_scores[order(dictionary_scores$glove_environment, decreasing = TRUE)[1:150], c("sent", "glove_environment")]
top_energy <- dictionary_scores[order(dictionary_scores$glove_energy, decreasing = TRUE)[1:150], c("sent", "glove_energy")]
top_crime <- dictionary_scores[order(dictionary_scores$glove_crime, decreasing = TRUE)[1:150], c("sent", "glove_crime")]
top_transport <- dictionary_scores[order(dictionary_scores$glove_transport, decreasing = TRUE)[1:150], c("sent", "glove_transport")]

save(top_defence, top_economy, top_agriculture, top_health, top_children,
     top_education, top_social, top_trade, top_environment, top_energy,
     top_crime, top_transport, file = "working/validation/top_speeches.Rdata")

# Measure correlation ---------------------------------------------------------------------

speech_cor <- cor(speech_scores[,c("defence_std", "economy_std", "agriculture_std", "health_std", "children_std",
                                   "education_std", "trade_std", "environment_std",  "crime_std", "transport_std",
                                   "social_std")], use = "complete.obs")
colnames(speech_cor) <- rownames(speech_cor) <- c("Defence", "Finance & Economy", "Agriculture", "Health", "Children & Family", "Education",
                                                  "Trade", "Environment", "Crime & Policing", "Transport", "Welfare")

pdf("analysis/plots/figure_S3.pdf",6,6)
par(mfrow = c(1,1))
corrplot::corrplot(speech_cor, title = "", method = "ellipse", diag = F, addgrid.col=NA,
                   addCoef.col="black", tl.col="black",number.cex=0.5,tl.cex = 0.7,cl.cex = 0.7,number.digits=1,order="FPC", mar = c(2,2,4,2)+0.1)
dev.off()

# Distribution of experience ---------------------------------------------------------------

hist_data <- speech_scores %>%
  group_by(person_id, gender) %>%
  summarise(max = max(years_in_parliament, na.rm=T))

hist_data$mean <- NA
hist_data$mean[hist_data$gender=="Female"] <- median(hist_data$max[hist_data$gender=="Female"])
hist_data$mean[hist_data$gender=="Male"] <- median(hist_data$max[hist_data$gender=="Male"])
hist_data$gender <- ifelse(hist_data$gender=="Male", "Men MPs", "Women MPs")

women_only <- hist_data[hist_data$gender=="Women MPs",]
women_only$person_id[women_only$max >= 20]

men_only <- hist_data[hist_data$gender=="Men MPs",]
men_only$person_id[men_only$max >= 20]

experience_histogram <- ggplot(hist_data, aes(x=max, color = gender)) +
  geom_histogram(binwidth = 2, aes(fill=gender,  col=gender)) +
  theme_bw() +
  scale_fill_manual(values = c(alpha("#0594AD", 0.8),alpha("#E67700", 0.8))) +
  scale_color_manual(values = c("#0594AD","#E67700")) +
  geom_vline(data=hist_data, (aes(xintercept=mean)), linetype="dashed", col = "black") +
  facet_wrap(~gender) + 
  theme(legend.position = "none", 
        axis.title=element_text(size=11), 
        strip.text = element_text(size = 11), 
        axis.text = element_text(size = 10)) +
  labs(x="Years in Parliament", y="Distribution of MPs") +
  scale_y_continuous(limits = c(0,150), breaks = seq(0, 150, by = 25), expand=c(0.03,0)) 

ggsave(experience_histogram, file = "analysis/plots/figure_2.pdf", width=6.5, height=3.5)

# Gendered debate participation -----------------------------------------------------------

debate_level <- speech_scores %>%
  group_by(section_id) %>%
  summarise(prop_women = max(prop_women, na.rm=T))

debate_level$prop_women <- debate_level$prop_women *100
debate_level$mean <- mean(debate_level$prop_women)

propotion_women <- ggplot(debate_level, aes(prop_women)) +
    geom_histogram(binwidth = 5, col = "#44D7A8", fill = alpha("#44D7A8", 0.7)) + 
  theme_bw() +
labs(x="Percentage of women speaking in a debate", y="") +
  scale_y_continuous(limits = c(0,8000), breaks = seq(0, 8000, by = 2000)) +
  geom_vline(data=debate_level, (aes(xintercept=mean)), linetype="dashed", col = "black") +
  theme(legend.position = "none", 
        axis.title=element_text(size=10), 
        axis.text = element_text(size = 9))
ggsave(propotion_women, file = "analysis/plots/figure_S5.pdf", width = 6.5, height = 3.5)
